# Dependency install (run once in the notebook environment):
# !pip install spotipy
# --- Environment setup: imports, warning suppression, data loading ---

# Standard library
import difflib
import os
import warnings
from collections import defaultdict

# Third-party: scientific stack
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist  # imported once (was duplicated)

# Third-party: visualization
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from yellowbrick.target import FeatureCorrelation

# Third-party: machine learning
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Third-party: Spotify API client
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Silence noisy library deprecation/convergence warnings.
warnings.filterwarnings("ignore")

# Load datasets (Colab paths — adjust if running locally).
data = pd.read_csv("/content/data.csv")
genre_data = pd.read_csv("/content/data_by_genres.csv")
year_data = pd.read_csv("/content/data_by_year.csv")
# --- Feature correlation with popularity ---

# Audio/metadata columns used as predictors.
feature_names = [
    "acousticness", "danceability", "energy", "instrumentalness",
    "liveness", "loudness", "speechiness", "tempo", "valence",
    "duration_ms", "explicit", "key", "mode", "year",
]

# Features (X) and popularity target (y).
X, y = data[feature_names], data["popularity"]

# FeatureCorrelation expects label names as an array.
features = np.array(feature_names)

# Visualize the Pearson correlation of each feature against popularity.
# (The pasted `<Axes: ...>` cell output that previously followed this
# block was notebook residue and has been removed — it was a syntax error.)
visualizer = FeatureCorrelation(labels=features)
plt.rcParams["figure.figsize"] = (20, 20)
visualizer.fit(X, y)
visualizer.show()
# Function to extract the decade label from a year
def get_decade(year):
    """Return the decade label for *year*, e.g. 1987 -> "1980s".

    Floor division (`// 10 * 10`) replaces the original
    `int(year / 10) * 10`, avoiding the float round-trip and giving
    exact results even when the year arrives as a float from pandas.
    """
    return f"{int(year) // 10 * 10}s"
# Tag each track with its release decade.
data["decade"] = data["year"].apply(get_decade)

# Plot the distribution of songs across decades.
sns.set(rc={"figure.figsize": (11, 6)})
# Pass the column explicitly as x= — positional Series arguments were
# reinterpreted as wide-form `data` in seaborn 0.12+, which produced the
# sideways count/decade plot seen in the original notebook output.
sns.countplot(x=data["decade"])

# --- Trends over time and a snapshot of the top genres ---

# Audio characteristics tracked across release years.
sound_features = ["acousticness", "danceability", "energy", "instrumentalness", "liveness", "valence"]
trend_fig = px.line(year_data, x="year", y=sound_features)
trend_fig.show()

# Ten most popular genres, compared on four audio characteristics.
top10_genres = genre_data.nlargest(10, "popularity")
compared_traits = ["valence", "energy", "danceability", "acousticness"]
genre_fig = px.bar(
    top10_genres,
    x="genres",
    y=compared_traits,
    barmode="group",
)
genre_fig.show()
# --- K-means clustering of genres ---

# Scale the features, then partition genres into 10 clusters.
# n_init is pinned explicitly (its default changed across sklearn 1.2-1.4)
# and random_state makes the cluster assignments reproducible — the
# original pipeline produced different labels on every run.
cluster_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=10, n_init=10, random_state=42)),
])

# Cluster on the numeric columns only.
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data["cluster"] = cluster_pipeline.predict(X)
# --- 2-D t-SNE projection of the genre clusters ---
# (The t-SNE verbose log lines pasted after fig.show() in the original
# were notebook output residue fused into the code — a syntax error —
# and have been removed.)

tsne_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("tsne", TSNE(n_components=2, verbose=1)),
])

# Embed the scaled genre features into two dimensions.
genre_embedding = tsne_pipeline.fit_transform(X)

# Plotting frame: 2-D coordinates + genre name + cluster id.
projection = pd.DataFrame(columns=["x", "y"], data=genre_embedding)
projection["genres"] = genre_data["genres"]
projection["cluster"] = genre_data["cluster"]

# Scatter plot colored by cluster, with the genre shown on hover.
fig = px.scatter(projection, x="x", y="y", color="cluster", hover_data=["x", "y", "genres"])
fig.show()
# Define clustering pipeline for songs
song_cluster_pipeline = Pipeline([
("scaler", StandardScaler()),
("kmeans", KMeans(n_clusters=20, verbose=False))
])
# Select numerical columns for clustering
X = data.select_dtypes(np.number)
number_cols = list(X.columns)
# Fit the pipeline and assign cluster labels
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data["cluster_label"] = song_cluster_labels# Define PCA pipeline
# --- 2-D PCA projection of the song clusters ---

# Scale, then reduce the song features to two principal components.
pca_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("PCA", PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# Build the plotting frame: coordinates, song title, and cluster id.
projection = pd.DataFrame(song_embedding, columns=["x", "y"])
projection["title"] = data["name"]
projection["cluster"] = data["cluster_label"]

# Scatter the songs colored by cluster, with titles on hover.
fig = px.scatter(projection, x="x", y="y", color="cluster", hover_data=["title"])
fig.show()